import requests
from bs4 import BeautifulSoup
import os

# Accumulator for thread links (absolute URLs ending in "html") found on listing pages.
LinkList = []
# 25800 records in total, 50 per page.
str_host = 'http://t66y.com/thread0806.php?fid=8&search=&page='
host = 'http://t66y.com/'
# Root directory that downloaded images are saved under.
root_path = 'D:/test1/'


# 获取服务器的页面数据
# Fetch a page from the server.
def get_index(url):
    """Fetch *url* and return its HTML text, or None on error/non-200 status.

    Sends browser-like headers (cookie, UA, referer) so the forum serves the
    normal page instead of rejecting the scraper.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        # Bug fix: was 'Accept - Encoding' — spaces make it an invalid header name.
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Cookie': "__cfduid=d921c7b3ae191fd6f6db4b50160c6e5281608047980; 227c9_lastvisit=0%091608048609%09%2Fthread0806.php%3Ffid%3D8%26search%3D%26page%3D2",
        'Host': "t66y.com",
        # Bug fix: dropped 'If-Modified-Since'/'If-None-Match' — conditional-GET
        # headers can make the server answer 304 Not Modified with an empty body,
        # which this function would treat as a failure (status != 200).
        'Proxy-Connection': 'Keep-Alive',
        'Referer': 'http://t66y.com/thread0806.php?fid=8',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Mobile Safari/537.36'}
    try:
        response = requests.get(url, headers=headers, timeout=29)
        response.encoding = 'utf-8'
        if response.status_code == 200:
            return response.text
        # Explicit None for non-200 (was an implicit fall-through before).
        return None
    except Exception as e:
        print('获取网页数据异常：' + str(e))
        return None


# 获取服务器的页面数据
# Fetch a listing page from the server, selecting the page via query parameters.
def get_index1(url, page):
    """Fetch *url* with `fid/search/page` query params; return HTML text or None.

    Browser-like headers (cookie, UA, referer) are sent so the forum serves
    the normal page to the scraper.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Cookie': "__cfduid=d921c7b3ae191fd6f6db4b50160c6e5281608047980; 227c9_lastvisit=0%091608048609%09%2Fthread0806.php%3Ffid%3D8%26search%3D%26page%3D2",
        'Host': "t66y.com",
        'Proxy-Connection': 'Keep-Alive',
        'Referer': 'http://t66y.com/thread0806.php?fid=8',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Mobile Safari/537.36'}
    params = {
        'fid': 8,
        'search': "",
        'page': page,
    }
    try:
        resp = requests.get(url, params=params, headers=headers, timeout=29)
        resp.encoding = 'utf-8'
        return resp.text if resp.status_code == 200 else None
    except Exception as e:
        print('获取网页数据异常：' + str(e))
        return None


# 获取页面的所有连接
# Collect every thread link on a listing page into the global LinkList.
def getPageUrl(text):
    """Parse listing-page HTML *text* and append new thread URLs to LinkList.

    Only anchors with target="_blank" whose href ends in "html" are kept;
    duplicates already in LinkList are skipped.
    """
    try:
        soup = BeautifulSoup(text, "html.parser")
        anchors = soup.findAll('a', {'target': '_blank'})
        print(len(anchors))
        for anchor in anchors:
            href = host + anchor.get('href')
            # Bug fix: the membership test used the undefined name 'LinkLDist',
            # which raised NameError on the first iteration (silently caught
            # below), so no links were ever collected.
            if href not in LinkList and href.endswith("html"):
                LinkList.append(href)
        # Print once after the loop instead of once per anchor.
        print(LinkList)
        print(len(LinkList))
    except Exception as e:
        print('error' + str(e))


# 获取帖子里面的图片
# Download the images of every thread currently held in LinkList.
def getAllImgUrl():
    """For each collected thread URL, fetch its image links and download them."""
    # 从文件中获取链接
    # LinkList = MyFileUtil.myred()
    print(LinkList)
    for thread_url in LinkList:
        img_links, title_name = GetPageImgUrl(thread_url)
        print('开始爬取：' + title_name + ' 的图片')
        for img_url in img_links:
            download_img(img_url, title_name)

# <img iyl-data="http://a.d/adblo_ck.jpg" ess-data="https://www.sxotu.xyz/i/2020/12/15/12vbo9j.jpg" src="https://www.sxotu.xyz/i/2020/12/15/12vbo9j.jpg" style="cursor: pointer;">
# 获取页面的所有图片连接
# <img iyl-data="http://a.d/adblo_ck.jpg" ess-data="https://www.sxotu.xyz/i/2020/12/15/12vbo9j.jpg" src="https://www.sxotu.xyz/i/2020/12/15/12vbo9j.jpg" style="cursor: pointer;">
# Collect all image links from one thread page.
def GetPageImgUrl(url):
    """Fetch thread page *url*; return (image-url list, thread title).

    Image URLs are taken from the 'data-link' attribute of <img> tags whose
    'iyl-data' matches the site's placeholder value (see sample tag above).
    On any error an empty list and/or empty title is returned.
    """
    LinkListImg = []
    # Bug fix: initialize the title — it was unbound at the return statement
    # whenever an exception fired before the <h4> lookup, turning the real
    # error into a NameError.
    title_name = ''
    try:
        text = get_index(url)
        soup = BeautifulSoup(text, "html.parser")
        title_name = soup.findAll('h4')[0].string
        pid = soup.findAll('img', attrs={'iyl-data': 'http://a.d/adblo_ck.jpg'})
        # Bug fix: apply the %-format — print('...%d', n) printed two raw args.
        print('页面图片数量:%d' % len(pid))
        for img_tag in pid:
            href = img_tag.get('data-link')
            if href not in LinkListImg:
                LinkListImg.append(href)
    except Exception as e:
        print('error:' + str(e))
    return LinkListImg, title_name

# 获取所有页面连接
# Crawl one (or, with the commented loop, many) listing page(s) and download images.
def getAllPageUrl():
    """Scrape listing page 2: collect its thread links, then download each
    thread's images.

    The commented-out loop below shows how to walk several pages.
    """
    # for i in range(1, 116, 1):
    #     print('虚幻虫爬取页数：'+str(i))
    #     LinkList.clear()
    #     getPageUrl(get_index(str_host+str(i)))
    #     getAllImgUrl()

    # Bug fix: get_index() takes only a URL — it was called with a second
    # 'page' argument (TypeError). str_host already ends in "page=", so the
    # page number is appended to the URL itself.
    getPageUrl(get_index(str_host + str(2)))
    getAllImgUrl()



#  下载图片
#  Download one image into root_path/<pathname>/.
def download_img(url, pathname):
    """Download *url* into the directory root_path/pathname/, keeping the
    URL's basename as the file name. Existing files are not re-downloaded.
    Errors are reported, never raised.
    """
    root = root_path + pathname + "/"
    path = root + url.split("/")[-1]
    print(url)
    try:
        if not os.path.exists(root):
            os.makedirs(root)
        if not os.path.exists(path):
            # Bug fix: added a timeout — a bare requests.get() can hang
            # forever; 29s matches the other requests in this file.
            r = requests.get(url, timeout=29)
            r.raise_for_status()
            print('picture name :' + path + ' dir path :' + root)
            # "with" closes the file handle automatically; "wb" = binary write.
            with open(path, "wb") as f:
                f.write(r.content)
            print("爬取完成")
        else:
            print("文件已存在")
    except Exception as e:
        print("爬取失败:" + str(e))


# 获取单篇文章
# Scrape one hard-coded thread page.
def get_deicide_page():
    """Download every image from a single, fixed thread URL."""
    url = r'https://www.t66y.com/htm_data/2309/8/5950057.html'
    img_links, title_name = GetPageImgUrl(url)
    print('开始爬取：' + title_name + ' 的图片')
    for img_url in img_links:
        download_img(img_url, title_name)


# Script entry point: currently scrapes a single hard-coded thread page;
# switch to getAllPageUrl() to crawl a listing page instead.
if __name__ == '__main__':
    # getAllPageUrl()
    get_deicide_page()

